Quantitative notions of relevance: Data analysis

Author

Alex Warstadt, Omar Aghar & Michael Franke

First load required packages and set some global parameters.

Code
library(tidyverse)
library(brms)
library(tidyboot)
library(tidyjson)
library(patchwork)
library(GGally)
library(cowplot)
library(BayesFactor)
library(aida)   # custom helpers: https://github.com/michael-franke/aida-package
library(faintr) # custom helpers: https://michael-franke.github.io/faintr/index.html
library(cspplot) # custom styles: https://github.com/CogSciPrag/cspplot
library(ordbetareg) # for ordered beta-regression
library(cmdstanr)


##################################################

# these options help Stan run faster
options(mc.cores = parallel::detectCores(),
        brms.backend = "cmdstanr")

# use the CSP-theme for plotting
theme_set(theme_csp())

# global color scheme from CSP
project_colors = cspplot::list_colors() |> pull(hex)

# setting theme colors globally
scale_colour_discrete <- function(...) {
  scale_colour_manual(..., values = project_colors)
}
scale_fill_discrete <- function(...) {
  scale_fill_manual(..., values = project_colors)
}

##################################################

rerun_models <- FALSE

Lambert_test <- function(loo_comp) {
  1 - pnorm(-loo_comp[2,1], loo_comp[2,2])
}

##################################################
rl_file <- "R_data_4_TeX/myvars-reproduce.csv"
myvars = list()
# add with: myvars[name] = value

Read & massage the data

Data preprocessing was done in Python. We here load the preprocessed data and rearrange for convenience.

Code
d <- read_csv("../results/round_2.0/results_preprocessed.csv") |> 
  # drop column with numbers
  select(-`...1`) |> 
  # set "non-answers" to AnswerPolarity "positive"
  mutate(AnswerPolarity = ifelse(
    AnswerCertainty == "non_answer", 
    "positive", 
    AnswerPolarity)) |> 
  # casting into factor
  mutate(
    group = factor(group, levels = c("relevant", "helpful")),
    ContextType = factor(ContextType, 
                         levels = c("negative", "neutral", "positive")),
    AnswerPolarity = factor(AnswerPolarity, 
                            levels = c("positive", "negative")),
    AnswerCertainty = factor(AnswerCertainty, 
                             levels = c("non_answer", "low_certainty", 
                                        "high_certainty", "exhaustive"))
  ) 

We explored several versions of various relevance metrics (as reported in the paper + appendix). Here we select only those versions of the relevance metrics used for the main analyses as reported in the paper:

Code
d <- d |>
  # columns to keep
  select(
    "submission_id",
    "group",
    "StimID",
    "AnswerCertainty",
    "AnswerPolarity",
    "ContextType",
    "attention_score",
    "reasoning_score",
    "prior_sliderResponse",
    "posterior_sliderResponse",
    "posterior_confidence",
    "prior_confidence",
    "relevance_sliderResponse",
    "first_order_belief_change",                   
    "second_order_belief_change",
    "pure_second_order_belief_change_scaled",
    "entropy_change_scaled",
    "beta_entropy_change_scaled",
    "kl_scaled",
    "beta_kl_scaled",
    "bayes_factor_utility",
    "beta_bayes_factor_utility_1_scaled"
    ) |> 
  rename(
    "ProbabilityChange" = "first_order_belief_change",
    "CommitmentChange" = "second_order_belief_change",
    "ConcentrationChange" = "pure_second_order_belief_change_scaled",
    "EntropyChange" = "entropy_change_scaled",
    "EntropyChange_2ndOrder" = "beta_entropy_change_scaled",
    "KLUtility" = "kl_scaled",
    "KLUtility_2ndOrder" = "beta_kl_scaled",
    "BayesFactor" = "bayes_factor_utility",
    "BayesFactor_2ndOrder" = "beta_bayes_factor_utility_1_scaled"
  )

Data exclusion

As per preregistered protocol, we exclude all data from participants who:

  • scored less than perfect on all attention checks,
  • scored less than 0.5 on reasoning tasks, or
  • have task-sensitivity of not more than 0.75

Task sensitivity is the proportion of critical trials (excluding non-answer trials) in which the change between prior and posterior rating was bigger than 0.05 or there was a non-zero change in confidence rating.

We first add the task-sensitivity score:

Code
d <- d |> as_tibble() |> 
  mutate(answer_class  = ifelse(AnswerCertainty != "non_answer", "answer", "non_answer"),
         belief_change = abs(prior_sliderResponse - posterior_sliderResponse) >= 0.05 |
                                prior_confidence != posterior_confidence,
         deviant = case_when(answer_class == "answer" ~ !belief_change,
                             answer_class != "answer" ~  FALSE)
         ) |> 
  group_by(submission_id) |> 
  mutate(task_sensitivity = 1- sum(deviant) / sum(answer_class == "answer")) |> 
  select(- answer_class, - belief_change, - deviant)

Then we apply our exclusion criteria:

Code
# initial number of participants
initial_nr_participants <- d |> pull(submission_id) |> unique() |> length()

d <- d |> 
  filter(attention_score == 1) |> 
  filter(reasoning_score > 0.5) |> 
  filter(task_sensitivity > 0.75)

# included participants
included_nr_participants <- d |> pull(submission_id) |> unique() |> length()

message("Initial number of participants: ", initial_nr_participants, 
        "\nIncluded after cleaning: ", included_nr_participants,
        "\nExcluded: ", initial_nr_participants - included_nr_participants)

Exploring the effect of the experimental factors

The experiment had the following main factors:

  • ContextType: whether the context made a ‘no’ or a ‘yes’ answer more likely a priori or whether it was neutral (within-subjects)
  • AnswerCertainty: how much information the answer provides towards a fully resolving answer (within-subjects)
  • AnswerPolarity: whether the answer suggests or implies a ‘no’ or a ‘yes’ answer (within-subjects)
    • ‘non-answers’ are treated as ‘positive’ for technical purposes, but this does not influence relevant analyses

In the following, we first check whether these experimental manipulations worked as intended.

Sanity-checking whether the manipulations worked as intended

Effects of ContextType on prior and prior confidence

To check whether the ContextType manipulation worked, we compare how participants rated the prior probability of a ‘yes’ answer under each level of the ContextType variable. Concretely, we expect this order of prior ratings for the levels of ContextType: negative < neutral < positive. Although we have no specific hypotheses or sanity-checking questions regarding the confidence ratings, let’s also scrutinize the confidence ratings that participants gave with their prior ratings.

Prior ratings as a function of ContextType

Here is a first plot addressing the question after an effect of ContextType on participants prior ratings.

Code
d |> ggplot(aes(x = prior_sliderResponse, color = ContextType, fill = ContextType)) +
  geom_density(alpha = 0.3, linewidth = 1.5) + 
  xlab("prior rating") +
  ylab("")

We dive deeper by fitting a regression model, predicting prior ratings in terms of the ContextType. Since participants have not seen the answer when they rate the prior probability of a ‘yes’ answer, ContextType is the only fixed effect we should include here. The model also includes the maximal RE structure. We use the ordbetareg package for (slider-data appropriate) zero-one inflated ordinal beta regression.

Code
if (rerun_models) {
  fit_contextType_SC <- ordbetareg::ordbetareg(
    prior_sliderResponse ~ ContextType + 
      (1 + ContextType | StimID) + 
      (1 + ContextType | submission_id),
    data = d,
    iter = 3000 
  )
  saveRDS(fit_contextType_SC, "cachedModels-round2/fit_contextType_SC.Rds")
} else{
  fit_contextType_SC <- readRDS("cachedModels-round2/fit_contextType_SC.Rds")
}

Our assumption is that prior ratings are higher in contexts classified as ‘neutral’ than in ‘negative’ contexts, and yet higher in ‘positive’ contexts. We use the faintr package to extract information on these directed comparisons.

Code
ContextType_SC_negVSneu <- 
  faintr::compare_groups(
    fit_contextType_SC, 
    lower = ContextType == "negative",
    higher = ContextType == "neutral" 
  )
ContextType_SC_neuVSpos <- 
  faintr::compare_groups(
    fit_contextType_SC, 
    lower = ContextType == "neutral",
    higher = ContextType == "positive"
  )

Prior confidence as a function of ContextType

Here is a visualization of the effect of ContextType on participants’ confidence in their prior ratings.

Code
d |> mutate(prior_confidence = factor(prior_confidence, levels = 1:7)) |> 
  ggplot(aes(x = prior_confidence, color = ContextType, fill = ContextType)) +
  geom_histogram(position = "dodge", stat='count') + 
  xlab("prior confidence") +
  ylab("")

To scrutinize the effect of ContextType on participants expressed confidence in their prior ratings, we use a ordered-logit (cumulative logit) regression (since prior confidence ratings are from a rating scale).

Code
if (rerun_models) {
  fit_contextType_SC_conf <- brm(
    prior_confidence ~ ContextType + 
      (1 + ContextType | StimID) + 
      (1 + ContextType | submission_id),
    data = d,
    family = cumulative(),
    iter = 3000
  )
  saveRDS(fit_contextType_SC_conf, "cachedModels-round2/fit_contextType_SC_conf.Rds")
} else{
  fit_contextType_SC_conf <- readRDS("cachedModels-round2/fit_contextType_SC_conf.Rds")
}
Code
ContextType_SC_neuVSneg_conf <- 
  compare_groups(
    fit_contextType_SC_conf, 
    lower = ContextType == "neutral",
    higher = ContextType == "negative" 
  )
ContextType_SC_negVSpos_conf <- 
  compare_groups(
    fit_contextType_SC_conf, 
    lower = ContextType == "negative",
    higher = ContextType == "positive"
  )
ContextType_SC_neuVSpos_conf <- 
  compare_groups(
    fit_contextType_SC_conf, 
    lower = ContextType == "neutral",
    higher = ContextType == "positive"
  )

Results

The results of these comparisons are summarized here:

comparison measure posterior HDI-low HDI-high
negative < neutral prior 0.9986667 0.278882 1.149290
neutral < positive prior 1.0000000 0.528150 1.551521
neutral < negative prior-confidence 0.9900000 0.176561 1.289040
negative < positive prior-confidence 0.8025000 -0.485097 1.177040
neutral < positive prior-confidence 0.9965000 0.355817 1.716866

The ContextType manipulation seems to have worked as expected for the prior ratings: lower in ‘negative’ than in ‘neutral’ than in ‘positive’. As for the confidence ratings, the ContextType manipulation seems to have induced lower confidence for the ‘neutral’ condition than for the ‘negative’ and ‘positive’ condition.

Effects of AnswerPolarity and AnswerCertainty on beliefChange

We can define beliefChange as the difference between posterior and prior in the direction expected from the answer’s polarity (posterior belief in ‘yes’ answer increases for a ‘positive’ answer when compared with the prior rating, but it decreases for ‘negative’ answers). (Careful: we ignore non-answers (which are categorized as “positive” for technical convenience only).) If our manipulation worked, we expect the following for both ‘positive’ and ‘negative’ polarity:

  1. beliefChange is > 0
  2. beliefChange is lower for ‘low certainty’ than for ‘high certainty’ than for ‘exhaustive’

Here is a plot visually addressing these issues:

Code
d |> filter(AnswerCertainty != "non_answer") |> 
  mutate(beliefChange = posterior_sliderResponse - prior_sliderResponse,
         beliefChange = ifelse(AnswerPolarity == "positive", beliefChange, - beliefChange)) |> 
  mutate(AnswerCertainty = case_when(AnswerCertainty == "low_certainty" ~ "low",
                                     AnswerCertainty == "high_certainty" ~ "high",
                                     TRUE ~ "exhaustive",
                                     ) |> factor(level = c("low", "high","exhaustive"))) |> 
  ggplot(aes(x = beliefChange, color = AnswerCertainty, fill = AnswerCertainty)) +
  geom_density(alpha = 0.2, linewidth=2) +
  facet_grid(. ~ AnswerPolarity) +
  xlab("belief change (in expected direction)") +
  ylab("") + theme_aida()

Code
ggsave("plots/belief-change.pdf", scale = 1.2, height = 4, width = 10)

beliefChange is positive

To address the first issue, whether beliefChange is positive for both types of polartiy, we first regress beliefChange against the full list of potentially relevant factors, including plausible RE structures. Notice that at the time of answer the questions related to the posterior, participants have not yet seen the question after relevance or helpfulness, so that factor group should be ommitted.

Code
if (rerun_models) {
  fit_answer_SC <- brm(
    formula = beliefChange ~ ContextType * AnswerCertainty * AnswerPolarity +
      (1 + ContextType + AnswerCertainty + AnswerPolarity | StimID) +
      (1 + ContextType + AnswerCertainty + AnswerPolarity | submission_id),
    data = d |> filter(AnswerCertainty != "non_answer") |> 
      mutate(beliefChange = posterior_sliderResponse - prior_sliderResponse,
             beliefChange = ifelse(AnswerPolarity == "positive", beliefChange, - beliefChange)),
    # adapt delta
    control = list(adapt_delta = 0.85)
  )
  saveRDS(fit_answer_SC, "cachedModels-round2/fit_answer_SC.Rds")
} else{
  fit_answer_SC <- readRDS("cachedModels-round2/fit_answer_SC.Rds")
}

We check if inferred cell means are credibly bigger than zero, for all six relevant design cells (facets in the plot above).

Code
# Check if belief change in each cell is bigger than zero
# - first exctract samples from the posterior estimates of means of relevant conditions
cellDraws_answers <- cbind(
  filter_cell_draws(fit_answer_SC, AnswerCertainty == "low_certainty"  & AnswerPolarity == "positive", "low_pos")[1],
  filter_cell_draws(fit_answer_SC, AnswerCertainty == "high_certainty" & AnswerPolarity == "positive", "high_pos")[1],
  filter_cell_draws(fit_answer_SC, AnswerCertainty == "exhaustive"     & AnswerPolarity == "positive", "exh_pos")[1],
  filter_cell_draws(fit_answer_SC, AnswerCertainty == "low_certainty"  & AnswerPolarity == "negative", "low_neg")[1],
  filter_cell_draws(fit_answer_SC, AnswerCertainty == "high_certainty" & AnswerPolarity == "negative", "high_neg")[1],
  filter_cell_draws(fit_answer_SC, AnswerCertainty == "exhaustive"     & AnswerPolarity == "negative", "exh_neg")[1]
) 

# all posterior 95% HDIs are way above 0 
apply( cellDraws_answers |> as.matrix(), MARGIN = 2, aida::summarize_sample_vector)
$low_pos
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""        0.0861 0.131  0.175

$high_pos
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""         0.204 0.256  0.311

$exh_pos
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""         0.397 0.447  0.496

$low_neg
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""        0.0917 0.176  0.246

$high_neg
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""         0.209 0.281  0.358

$exh_neg
# A tibble: 1 × 4
  Parameter `|95%`  mean `95%|`
  <chr>      <dbl> <dbl>  <dbl>
1 ""         0.296 0.363  0.426
Code
# posterior probability of mean bigger 0 for each cell is almost 1 everywhere
apply(as.matrix(cellDraws_answers), MARGIN=2, function(x) {mean(x>0)})
 low_pos high_pos  exh_pos  low_neg high_neg  exh_neg 
 1.00000  1.00000  1.00000  0.99925  1.00000  1.00000 

These results suggest that there is little reason to doubt that the belief changes induces by the answers -as per the experimentally intended manipulation- went in the right direction in all cases.

beliefChange increases with more informative answers

Finally, we investigate whether beliefChange increases with more informative answers, using the same regression model as before.

Code
AnswerPolarity_main <- compare_groups(
  fit_answer_SC,
  lower = AnswerPolarity == "positive",
  higher  = AnswerPolarity == "negative"
)

AnswerCertainty_lowVShigh <- compare_groups(
  fit_answer_SC,
  lower   = AnswerCertainty == "low_certainty",
  higher  = AnswerCertainty == "high_certainty"
)

AnswerCertainty_highVSexh <- compare_groups(
  fit_answer_SC,
  lower   = AnswerCertainty == "high_certainty",
  higher  = AnswerCertainty == "exhaustive"
)
comparison measure posterior HDI (low) HDI (high)
pos vs neg polarity belief change 0.45525 -0.0810213 0.0665940
low vs high certainty belief change 0.99975 0.0658995 0.1681333
high certain vs exh belief change 0.99900 0.0691567 0.2003228

We see no indication of a main effect of polarity, but find support for the idea that our manipulation of AnswerCertainty induced gradually larger belief changes. I sum, it seems that the stimuli were adequately created to implement the intended manipulation in the variables AnswerCertainty and AnswerPolarity.

Predicting relevance in terms of the experimental factors

We want to explore how relevance ratings depend on the experimental manipulations. We therefore investigate the effects which variables AnswerCertainty AnswerPolarity and ContextType have on relevance ratings, starting with a visualization:

Code
d |> 
  mutate(AnswerCertainty = case_when(AnswerCertainty == "low_certainty" ~ "low",
                                     AnswerCertainty == "high_certainty" ~ "high",
                                     AnswerCertainty == "non_answer" ~ "non-answer",
                                     TRUE ~ "exhaustive",
                                     ) |> factor(level = c("non-answer", "low", "high","exhaustive"))) |> 
  ggplot(aes(x = relevance_sliderResponse, color = AnswerPolarity, fill = AnswerPolarity)) +
  facet_grid(AnswerCertainty ~ ContextType , scales = "free") +
  geom_density(alpha = 0.2, linewidth = 1.5) +
  xlab("relevance rating") + ylab("")

Code
ggsave("plots/relevance-ratings.pdf", scale=1.2, width = 10, height=6)

Fitting the regression model, predicting relevance in terms of our main experimental factors:

Code
if (rerun_models) {

  fit_relevance_expFactors <- ordbetareg::ordbetareg(
    relevance_sliderResponse ~ ContextType * AnswerCertainty * AnswerPolarity +
      (1 + ContextType + AnswerCertainty + AnswerPolarity || StimID) +
      (1 + ContextType + AnswerCertainty + AnswerPolarity || submission_id),
    data = d,
    coef_prior_SD = 5,
    save_pars = save_pars(all=TRUE)
  )
  saveRDS(fit_relevance_expFactors, "cachedModels-round2/fit_relevance_expFactors.Rds")
} else {
  fit_relevance_expFactors <- read_rds("cachedModels-round2/fit_relevance_expFactors.Rds")
}

Let’s now look at a bunch of contrasts (based on the previously fitted full model).

Code
## expected ordering relation?

## non-answers vs low-certainty => poster = 1
nonAns_VS_low  <- compare_groups(
  fit_relevance_expFactors,
  lower  = AnswerCertainty == "non_answer",
  higher = AnswerCertainty == "low_certainty"
)
## low-certainty vs high-certainty => poster = 0.9922
low_VS_high <- compare_groups(
  fit_relevance_expFactors,
  lower  = AnswerCertainty == "low_certainty",
  higher = AnswerCertainty == "high_certainty"
)
## high-certainty vs exhaustive => poster = 1
high_VS_exh <- compare_groups(
  fit_relevance_expFactors,
  lower  = AnswerCertainty == "high_certainty",
  higher = AnswerCertainty == "exhaustive"
)

## effects of AnswerPolarity?

AnswerPolarity_main <- compare_groups(
  fit_relevance_expFactors,
  lower  = AnswerPolarity == "positive" & AnswerCertainty != "non_answer",
  higher = AnswerPolarity == "negative" & AnswerCertainty != "non_answer"
)

AnswerPolarity_lowCertain <- compare_groups(
  fit_relevance_expFactors,
  lower  = AnswerPolarity == "positive" & AnswerCertainty == "low_certainty",
  higher = AnswerPolarity == "negative" & AnswerCertainty == "low_certainty"
)

AnswerPolarity_highCertain <-compare_groups(
  fit_relevance_expFactors,
  lower  = AnswerPolarity == "positive" & AnswerCertainty == "high_certainty",
  higher = AnswerPolarity == "negative" & AnswerCertainty == "high_certainty"
)

AnswerPolarity_exhaustive <-compare_groups(
  fit_relevance_expFactors,
  lower  = AnswerPolarity == "positive" & AnswerCertainty == "exhaustive",
  higher = AnswerPolarity == "negative" & AnswerCertainty == "exhaustive"
)

ContextType_neutral_negative <- 
  compare_groups(
    fit_relevance_expFactors, 
    higher = ContextType == "neutral", 
    lower = ContextType == "negative"
    )

ContextType_neutral_positive <-
  compare_groups(
    fit_relevance_expFactors,
    higher = ContextType == "neutral",
    lower = ContextType == "positive"
  )


cellComparisons <- tribble(
  ~comparison, ~measure, ~posterior, ~"HDI (low)", ~"HDI (high)",
  "non-answer < low certainty" , "relevance", nonAns_VS_low$post_prob, nonAns_VS_low$l_ci, nonAns_VS_low$u_ci,  
  "low certain < high certain" , "relevance", low_VS_high$post_prob, low_VS_high$l_ci, low_VS_high$u_ci,  
  "high certain < exhaustive" , "relevance", high_VS_exh$post_prob, high_VS_exh$l_ci, high_VS_exh$u_ci,  
  "answer: pos < neg", "relevance", AnswerPolarity_main$post_prob, AnswerPolarity_main$l_ci, AnswerPolarity_main$u_ci,
  "context: neutral > pos", "relevance", ContextType_neutral_positive$post_prob, ContextType_neutral_positive$l_ci, ContextType_neutral_positive$u_ci,
  "context: neutral > neg", "relevance", ContextType_neutral_negative$post_prob, ContextType_neutral_negative$l_ci, ContextType_neutral_negative$u_ci
)

write_csv(cellComparisons,
          "R_data_4_TeX/results-relevance-ratings.csv", col_names = T)

knitr::kable(cellComparisons)
comparison measure posterior HDI (low) HDI (high)
non-answer < low certainty relevance 1.00000 1.8335325 2.5278409
low certain < high certain relevance 0.99650 0.1366539 0.8850350
high certain < exhaustive relevance 1.00000 0.9448637 1.8827872
answer: pos < neg relevance 0.43075 -0.3106660 0.2514171
context: neutral > pos relevance 0.98550 0.0182465 0.3228557
context: neutral > neg relevance 0.88975 -0.0535692 0.2233926

The table shows results indicating that there are (non-surprising) effects of AnswerType with non-answers rated as least relevant, followed by low-certainty, then high-certainty answers, and final exhaustive answers. There is no (strong) indication for a main effect of AnswerPolarity or ContextType. The lack of an effect of ContextType might be interpreted as prima facie evidence in favor of quantitative notions of relevance that do not take the prior into account (at least not very prominently).

Here is a plot of the relevant posterior draws visually supporting why we compared the three factor levels of ContextType in the way we did (positive is the lowest, neutral the highest, but this difference is still not strongly indicative of a difference (0 included in HDI)):

Code
draws_ContextType <- 
  cbind(
    filter_cell_draws(fit_relevance_expFactors, ContextType == "positive", colname = "positive")[1],
    filter_cell_draws(fit_relevance_expFactors, ContextType == "negative", colname = "negative")[1],
    filter_cell_draws(fit_relevance_expFactors, ContextType == "neutral" , colname = "neutral" )[1]
  ) |> pivot_longer(cols = everything())

draws_ContextType |> 
  ggplot(aes(x = value, color = name, fill = name)) +
  geom_density(alpha = 0.3)

Addressing the main research hypotheses

Research hypotheses 1 and 2 are basic predictions in terms of simple measures of first- and second-order belief change. Research hypothesis 3 is about different notions of quantifying informational relevance.

Hypothesis 1: first-order belief change explains relevance ratings

The hypothesis is that higher belief changes (induced by the answer) lead to higher relevance ratings. We test this hypothesis by a linear beta regression model (with maximal random effects) that regresses relevance ratings against the absolute difference between prior and posterior ratings (ProbabilityChange). We judge there to be evidence in favor of this hypothesis if the relevant slope coefficient is estimated to be credibly bigger than zero (posterior probability > 0.944; an arbitrary value to indicate that there is nothing special about 0.95) and a loo-based model comparison with an intercept only model substantially favors the model that includes the relevant slope.

Run the regression model:

Code
if (rerun_models) {
  
  fit_belief_diff <- ordbetareg(
    formula = relevance_sliderResponse ~ ProbabilityChange + 
      (1 + ProbabilityChange | submission_id) + (1 + ProbabilityChange | StimID),
    data = d
  )
  
  fit_belief_diff_interceptOnly <- ordbetareg(
    formula = brms::bf(relevance_sliderResponse ~ 1, center= FALSE),
    data = d |> mutate(Int = 1)
  )
  
  write_rds(fit_belief_diff, "cachedModels-round2/fit_belief_diff.rds")
  write_rds(fit_belief_diff_interceptOnly, "cachedModels-round2/fit_belief_diff_interceptOnly.rds")
  
} else {
  fit_belief_diff <- read_rds("cachedModels-round2/fit_belief_diff.rds")
  fit_belief_diff_interceptOnly <- read_rds("cachedModels-round2/fit_belief_diff_interceptOnly.rds")
}

Test hypothesis 1:

Code
hyp1_summary <- brms::hypothesis(fit_belief_diff, "ProbabilityChange > 0")
loo_comp_hyp1 <- loo_compare(
  list("w/__beliefDiff" = loo(fit_belief_diff),
       "w/o_beliefDiff" = loo(fit_belief_diff_interceptOnly)))
loo_comp_hyp1
               elpd_diff se_diff
w/__beliefDiff    0.0       0.0 
w/o_beliefDiff -348.9      23.9 
Code
# Lambert's z-score test
Lambert_test(loo_comp_hyp1)
[1] 0
Code
post_samples <- hyp1_summary$samples |> pull(H1)
HDIs <- HDInterval::hdi(post_samples, credMass = 0.944)

myvars["hyp1PosteriorMean"] <- mean(post_samples) |> round(3)
myvars["hyp1PosteriorHypothesis"] <- hyp1_summary$hypothesis[1,"Post.Prob"]
myvars["hyp1HDILow"] <- HDIs['lower'] |> round(3)
myvars["hyp1HDIHigh"] <- HDIs['upper'] |> round(3)
myvars["hyp1LOODiff"] <- -1* (loo_comp_hyp1[2,1] |> round(3))
myvars["hyp1LOOSE"] <- loo_comp_hyp1[2,2] |> round(3)
myvars["hyp1LOOPValue"] <- Lambert_test(loo_comp_hyp1) |> round(3)

We find support for hypothesis one.

Hypothesis 2: confidence change additionally contributes to relevance rating

We also hypothesize that change in confidence (CommitmentChange) ratings additionally contributes to predicting relevance ratings. Concretely, we address this hypothesis with a linear beta regression model like for hypothesis 1, but also including the absolute difference in confidence ratings for before and after the answer (and the interaction term). We use the maximal RE-structure. We speak of evidence in favor of this hypothesis if the relevant posterior slope parameter is credibly bigger than zero and a loo-based model comparison favors the more complex model. We speak of evidence against this hypothesis if the loo-based model comparison favors the simpler model.

Run the regression model:

Code
if (rerun_models) {
  
  fit_confidence_diff <- ordbetareg(
    formula = relevance_sliderResponse ~ ProbabilityChange * CommitmentChange + 
      (1 + ProbabilityChange * CommitmentChange | submission_id) + 
      (1 + ProbabilityChange * CommitmentChange | StimID),
    data = d,
    control = list(adapt_delta = 0.95)
  )
  
  write_rds(fit_confidence_diff, "cachedModels-round2/fit_confidence_diff.rds")
  
} else {
  fit_confidence_diff <- read_rds("cachedModels-round2/fit_confidence_diff.rds")
}

Test Hypothesis 2:

Code
hyp2_summary <- brms::hypothesis(fit_confidence_diff, "CommitmentChange > 0")
loo_comp_hyp2 <- loo_compare(list("w/__confidence" = loo(fit_confidence_diff),
                                  "w/o_confidence" = loo(fit_belief_diff)))
loo_comp_hyp2
               elpd_diff se_diff
w/__confidence   0.0       0.0  
w/o_confidence -53.1      11.1  
Code
# Lambert's z-score test
Lambert_test(loo_comp_hyp2)
[1] 0
Code
post_samples <- hyp2_summary$samples |> pull(H1)
HDIs <- HDInterval::hdi(post_samples, credMass = 0.944)

myvars["hyp2PosteriorMean"] <- mean(post_samples) |> round(3)
myvars["hyp2PosteriorHypothesis"] <- hyp2_summary$hypothesis[1,"Post.Prob"]
myvars["hyp2HDILow"] <- HDIs['lower'] |> round(3)
myvars["hyp2HDIHigh"] <- HDIs['upper'] |> round(3)
myvars["hyp2LOODiff"] <- -1* (loo_comp_hyp2[2,1] |> round(3))
myvars["hyp2LOOSE"] <- loo_comp_hyp2[2,2] |> round(3)
myvars["hyp2LOOPValue"] <- Lambert_test(loo_comp_hyp2) |> round(3)

We find support for hypothesis two.

Hypothesis 3: “Bayes Factor utility” is the best single-factor predictor of relevance ratings

The third hypothesis is that the BayesFactor is a better (single-factor, linear) predictor of relevance_sliderResponse than KLUtility and EntropyChange. We address this hypothesis with LOO cross-validation. We also directly include the exploratory hypothesis 1 here, thus comparing all single-factor models.

Code
get_single_factor_formula <- function(factor) {
  # get formula for single factor with full RE structure
  brms::brmsformula(
    str_c("relevance_sliderResponse ~ (1 + ", factor, " | submission_id)",
    "+ (1 + ", factor, " | StimID) + ", factor))
}

if (rerun_models) {
  
  # ER
  fit_loo_ER <- ordbetareg(
    get_single_factor_formula("EntropyChange"),
    iter = 6000,
    save_pars = save_pars(all = T),
    data = d,
    control = list(adapt_delta = 0.9)
  ) |> add_criterion("loo", model_name = "ER", moment_match = T)
  
  # KL
  fit_loo_KL <- ordbetareg(
    get_single_factor_formula("KLUtility"),
    iter = 6000,
    save_pars = save_pars(all = T),
    data = d,
    control = list(adapt_delta = 0.8)
  ) |> add_criterion("loo", model_name = "KL")
  
  # BF
  fit_loo_BF <- ordbetareg(
    get_single_factor_formula("BayesFactor"),
    iter = 6000,
    save_pars = save_pars(all = T),
    data = d,
    control = list(adapt_delta = 0.8)
  ) |> add_criterion("loo", model_name = "BF")
  
  # ER_beta
  fit_loo_ER_beta <- ordbetareg(
    get_single_factor_formula("EntropyChange_2ndOrder"),
    iter = 6000,
    save_pars = save_pars(all = T),
    data = d,
    control = list(adapt_delta = 0.85)
  ) |> add_criterion("loo", model_name = "ER_beta")
  
  # KL_beta
  fit_loo_KL_beta <- ordbetareg(
    get_single_factor_formula("KLUtility_2ndOrder"),
    iter = 6000,
    save_pars = save_pars(all = T),
    data = d,
    control = list(adapt_delta = 0.8)
  ) |> add_criterion("loo", model_name = "KL_beta")
  
  # BF_beta
  fit_loo_BF_beta <- ordbetareg(
    get_single_factor_formula("BayesFactor_2ndOrder"),
    iter = 6000,
    save_pars = save_pars(all = T),
    data = d,
    control = list(adapt_delta = 0.95)
  ) |> add_criterion("loo", model_name = "BF_beta")
  
  # belief_diff 
  fit_loo_belief_diff <- ordbetareg(
    get_single_factor_formula("ProbabilityChange"),
    iter = 6000,
    save_pars = save_pars(all = T),
    data = d,
    control = list(adapt_delta = 0.8)
  ) |> add_criterion("loo", model_name = "belief_diff")
  
  # confidence_diff 
  fit_loo_confidence_diff <- ordbetareg(
    get_single_factor_formula("CommitmentChange"),
    iter = 6000,
    save_pars = save_pars(all = T),
    data = d,
    control = list(adapt_delta = 0.8)
  ) |> add_criterion("loo", model_name = "confidence_diff")
  
  # concentration change 
  fit_loo_concentration_change <- ordbetareg(
    get_single_factor_formula("ConcentrationChange"),
    iter = 6000,
    save_pars = save_pars(all = T),
    data = d,
    control = list(adapt_delta = 0.9)
  ) |> add_criterion("loo", model_na9e = "concentration_change")
  
  write_rds(fit_loo_ER, "cachedModels-round2/fit_loo_ER.rds")
  write_rds(fit_loo_KL, "cachedModels-round2/fit_loo_KL.rds")
  write_rds(fit_loo_BF, "cachedModels-round2/fit_loo_BF.rds")
  write_rds(fit_loo_ER_beta, "cachedModels-round2/fit_loo_ER_beta.rds")
  write_rds(fit_loo_KL_beta, "cachedModels-round2/fit_loo_KL_beta.rds")
  write_rds(fit_loo_BF_beta, "cachedModels-round2/fit_loo_BF_beta.rds")
  write_rds(fit_loo_belief_diff, "cachedModels-round2/fit_loo_belief_diff.rds")
  write_rds(fit_loo_confidence_diff, "cachedModels-round2/fit_loo_confidence_diff.rds")
  write_rds(fit_loo_concentration_change, "cachedModels-round2/fit_loo_conccentration_change.rds")
  
} else{
  fit_loo_ER <- read_rds("cachedModels-round2/fit_loo_ER.rds")
  fit_loo_KL <- read_rds("cachedModels-round2/fit_loo_KL.rds")
  fit_loo_BF <- read_rds("cachedModels-round2/fit_loo_BF.rds")
  fit_loo_ER_beta <- read_rds("cachedModels-round2/fit_loo_ER_beta.rds")
  fit_loo_KL_beta <- read_rds("cachedModels-round2/fit_loo_KL_beta.rds")
  fit_loo_BF_beta <- read_rds("cachedModels-round2/fit_loo_BF_beta.rds")
  fit_loo_belief_diff <- read_rds("cachedModels-round2/fit_loo_belief_diff.rds")
  fit_loo_confidence_diff <- read_rds("cachedModels-round2/fit_loo_confidence_diff.rds")
}

LOO-based model comparison:

Code
loo_compare(
 fit_loo_ER, 
 fit_loo_KL, 
 fit_loo_BF,
 fit_loo_ER_beta, 
 fit_loo_KL_beta, 
 fit_loo_BF_beta,
 fit_loo_belief_diff,
 fit_loo_confidence_diff,
 fit_loo_concentration_change
)
                             elpd_diff se_diff
fit_loo_BF                      0.0       0.0 
fit_loo_KL_beta              -201.4      20.8 
fit_loo_ER                   -203.0      23.8 
fit_loo_KL                   -219.1      16.8 
fit_loo_BF_beta              -273.7      37.1 
fit_loo_ER_beta              -314.2      35.2 
fit_loo_belief_diff          -359.2      29.5 
fit_loo_concentration_change -489.1      36.9 
fit_loo_confidence_diff      -560.8      37.0 
Code
model_names <- c("probability change", "entropy change", "KL utility", "BF utility", 
                 "commitment change", "concentration change", "entropy change (2nd-order)", 
                 "KL utility (2nd-order)", "BF utility (2nd-order)")
tibble(
  model = factor(model_names, levels = rev(model_names)),
  level = c(rep("first-order", times = 4), rep("second-order", times = 5)),
  ELPD  = c(
    loo(fit_loo_belief_diff)$estimates['elpd_loo','Estimate'],
    loo(fit_loo_ER)$estimates['elpd_loo','Estimate'], 
    loo(fit_loo_KL)$estimates['elpd_loo','Estimate'],
    loo(fit_loo_BF)$estimates['elpd_loo','Estimate'],
    loo(fit_loo_confidence_diff)$estimates['elpd_loo','Estimate'],
    loo(fit_loo_concentration_change)$estimates['elpd_loo','Estimate'],
    loo(fit_loo_ER_beta)$estimates['elpd_loo','Estimate'],  
    loo(fit_loo_KL_beta)$estimates['elpd_loo','Estimate'], 
    loo(fit_loo_BF_beta)$estimates['elpd_loo','Estimate']
  ),
  SE = c(
    loo(fit_loo_belief_diff)$estimates['elpd_loo','SE'],
    loo(fit_loo_ER)$estimates['elpd_loo','SE'], 
    loo(fit_loo_KL)$estimates['elpd_loo','SE'],
    loo(fit_loo_BF)$estimates['elpd_loo','SE'],
    loo(fit_loo_confidence_diff)$estimates['elpd_loo','SE'],
    loo(fit_loo_concentration_change)$estimates['elpd_loo','SE'],
    loo(fit_loo_ER_beta)$estimates['elpd_loo','SE'],  
    loo(fit_loo_KL_beta)$estimates['elpd_loo','SE'], 
    loo(fit_loo_BF_beta)$estimates['elpd_loo','SE']
  ),
  lower = ELPD - SE,
  upper = ELPD + SE
) |> 
  ggplot(aes( x = model , y = ELPD, color = level)) +
  geom_linerange(aes(min = lower, max = upper), size = 0.5) +
  geom_point(aes(shape = level), size = 2.5) +
  coord_flip() +
  xlab("") +
  ylab("expected log likelihood (LOO-CV) ")

Code
ggsave("plots/comparison-single-factors.pdf", scale = 0.8, width = 8, height = 4)

Testing whether there is a substantial difference between the two best models:

Code
loo_comp_hyp3 <- loo_compare(fit_loo_BF, fit_loo_BF_beta)
loo_comp_hyp3
                elpd_diff se_diff
fit_loo_BF         0.0       0.0 
fit_loo_BF_beta -273.7      37.1 
Code
Lambert_test(loo_comp_hyp3)
[1] 0
Code
# Comparing the best model to the second best model
myvars["hyp3BFvsBFBetaLOODiff"] <- -1* (loo_comp_hyp3[2,1] |> round(3))
myvars["hyp3BFvsBFBetaLOOSE"] <- loo_comp_hyp3[2,2] |> round(3)
myvars["hyp3BFvsBFBetaPValue"] <- Lambert_test(loo_comp_hyp3) |> round(3)

Yes, there is a noteworthy difference.

We can conclude that first-order BF is the single best predictor of the relevance ratings.

Addressing the exploratory hypotheses

Exploratory Hypothesis 2: adding ConcentrationChange to all first-order measures

To complement our confirmatory hypothesis 2, we also explore whether adding another measure of higher-order uncertainty change adds predictive performance to each first-order measure of belief change. So here we compare, for each measure \(X\) (“entropy change”, “KL”, and “Bayes factor”) for first-order belief change, whether adding the factor ConcentrationChange increases the predictive performance. Concretely, we compare a model with single factor \(X\) as a predictor to a model with predictors \(X\), ConcentrationChange and their interaction. For ease of fitting, no random effects are included.

First, run the relevant models.

Code
get_formula_two_factors <- function(factor1, factor2) {
  # brms::brmsformula(
  #   str_c(
  #     "relevance_sliderResponse ~ (1 + ", factor1, " + " ,  factor2, " | submission_id)",
  #     "+ (1 + ", factor1, " + " , factor2, " | StimID) + ", 
  #     factor1, " * ", factor2))
  if (factor2 == "") {
    brms::brmsformula(
      str_c(
        "relevance_sliderResponse ~ ", factor1))  
  } else {
    brms::brmsformula(
    str_c(
      "relevance_sliderResponse ~ ", factor1, " * " ,  factor2))
  }
}

# get_formula_two_factors("X", "Y")
# get_formula_two_factors("X", "")

fit_model_two_factors <- function(name, factor1, factor2) {
  filename <- paste0(c("cachedModels-round2/", name, ".rds"), collapse = "")
  if (rerun_models) {
    x <- ordbetareg(
      get_formula_two_factors(factor1, factor2),
      iter = 2000,
      save_pars = save_pars(all = T),
      data = d,
      control = list(adapt_delta = 0.9)
    ) |> add_criterion("loo", model_name = name, moment_match = T)
    write_rds(x, filename) 
    return(x)
  } else{
    read_rds(filename) 
  }
  
}

# ER
fit_expHyp2_ER <- fit_model_two_factors(
  "fit_expHyp2_ER", 
  "EntropyChange", 
  "")
fit_expHyp2_ER_pure <- fit_model_two_factors(
  "fit_expHyp2_ER_pure", 
  "EntropyChange", 
  "ConcentrationChange")


#KL
fit_expHyp2_KL <- fit_model_two_factors(
  "fit_expHyp2_KL", 
  "KLUtility", 
  "")
fit_expHyp2_KL_pure <- fit_model_two_factors(
  "fit_expHyp2_KL_pure", 
  "KLUtility", 
  "ConcentrationChange")


#BF
fit_expHyp2_BF <- fit_model_two_factors(
  "fit_expHyp2_BF", 
  "BayesFactor", 
  "")
fit_expHyp2_BF_pure <- fit_model_two_factors(
  "fit_expHyp2_BF_pure", 
  "BayesFactor", 
  "ConcentrationChange")

Then compare the relevant pairs with LOO:

Code
(loo_c_explHyp2_ER <- loo_compare(fit_expHyp2_ER, fit_expHyp2_ER_pure))
                    elpd_diff se_diff
fit_expHyp2_ER_pure   0.0       0.0  
fit_expHyp2_ER      -38.2       9.1  
Code
(loo_c_explHyp2_KL <- loo_compare(fit_expHyp2_KL, fit_expHyp2_KL_pure))
                    elpd_diff se_diff
fit_expHyp2_KL_pure   0.0       0.0  
fit_expHyp2_KL      -44.1       9.4  
Code
(loo_c_explHyp2_BF <- loo_compare(fit_expHyp2_BF, fit_expHyp2_BF_pure))
                    elpd_diff se_diff
fit_expHyp2_BF_pure   0.0       0.0  
fit_expHyp2_BF      -23.2       6.4  

It appears that for all three measures of first-order belief change, adding ConcentrationChange yields a substantially better model.

Code
myvars["expHyp2ERLOODiff"] <- -1* (loo_c_explHyp2_ER[2,1] |> round(3))
myvars["expHyp2ERLOOSE"]   <- loo_c_explHyp2_ER[2,2] |> round(3)
  
myvars["expHyp2KLLOODiff"] <-  -1* (loo_c_explHyp2_KL[2,1] |> round(3))
myvars["expHyp2KLLOOSE"]   <-  loo_c_explHyp2_KL[2,2] |> round(3)

myvars["expHyp2BFLOODiff"] <-  -1* (loo_c_explHyp2_BF[2,1] |> round(3))
myvars["expHyp2BFLOOSE"]   <-    loo_c_explHyp2_BF[2,2] |> round(3)

Exploratory Hypothesis 3: compare all combinations of first- and second-order measures

Finally, we just compare models with all combinations of first- and second-order measures. Questions of interest are:

  1. Which arbitrary combination of first- and second-order measures is the best?
  2. Does it matter to be consistent in the choice of first- and second order measure, i.e., is the performance of “first-order X” always most boosted when we supply it with “second-order X” instead of some other “second-order Y”?

Let’s run the models first. For ease of fitting, no random effects are included.

Code
# base-level: pure

fit_loo_pure_ER <- fit_model_two_factors(
  "fit_loo_pure_ER",  "ProbabilityChange",  "EntropyChange_2ndOrder")

fit_loo_pure_KL <- fit_model_two_factors(
  "fit_loo_pure_KL",  "ProbabilityChange",  "KLUtility_2ndOrder")

fit_loo_pure_BF <- fit_model_two_factors(
  "fit_loo_pure_BF",  "ProbabilityChange",  "BayesFactor_2ndOrder")

fit_loo_pure_pure <- fit_model_two_factors(
  "fit_loo_pure_pure",  "ProbabilityChange",  "ConcentrationChange")

# base-level: ER

fit_loo_ER_ER <- fit_model_two_factors(
  "fit_loo_ER_ER",  "EntropyChange",  "EntropyChange_2ndOrder")

fit_loo_ER_KL <- fit_model_two_factors(
  "fit_loo_ER_KL",  "EntropyChange",  "KLUtility_2ndOrder")

fit_loo_ER_BF <- fit_model_two_factors(
  "fit_loo_ER_BF",  "EntropyChange",  "BayesFactor_2ndOrder")

fit_loo_ER_pure <- fit_model_two_factors(
  "fit_loo_ER_pure",  "EntropyChange",  "ConcentrationChange")

# base-level: KL

fit_loo_KL_ER <- fit_model_two_factors(
  "fit_loo_KL_ER",  "KLUtility",  "EntropyChange_2ndOrder")

fit_loo_KL_KL <- fit_model_two_factors(
  "fit_loo_KL_KL",  "KLUtility",  "KLUtility_2ndOrder")

fit_loo_KL_BF <- fit_model_two_factors(
  "fit_loo_KL_BF",  "KLUtility",  "BayesFactor_2ndOrder")

fit_loo_KL_pure <- fit_model_two_factors(
  "fit_loo_KL_pure",  "KLUtility",  "ConcentrationChange")

# base-level: BF

fit_loo_BF_ER <- fit_model_two_factors(
  "fit_loo_BF_ER",  "BayesFactor",  "EntropyChange_2ndOrder")

fit_loo_BF_KL <- fit_model_two_factors(
  "fit_loo_BF_KL",  "BayesFactor",  "KLUtility_2ndOrder")

fit_loo_BF_BF <- fit_model_two_factors(
  "fit_loo_BF_BF",  "BayesFactor",  "BayesFactor_2ndOrder")

fit_loo_BF_pure <- fit_model_two_factors(
  "fit_loo_BF_pure",  "BayesFactor",  "ConcentrationChange")

Compare all models.

Code
loo_compare(
 fit_loo_pure_ER, 
 fit_loo_pure_KL, 
 fit_loo_pure_BF, 
 fit_loo_pure_pure,
 fit_loo_ER_ER, 
 fit_loo_ER_KL, 
 fit_loo_ER_BF, 
 fit_loo_ER_pure,
 fit_loo_KL_ER, 
 fit_loo_KL_KL, 
 fit_loo_KL_BF,
 fit_loo_KL_pure,
 fit_loo_BF_ER, 
 fit_loo_BF_KL, 
 fit_loo_BF_BF,
 fit_loo_BF_pure
)
                  elpd_diff se_diff
fit_loo_BF_BF        0.0       0.0 
fit_loo_BF_ER      -49.6      13.0 
fit_loo_BF_pure    -73.5      11.6 
fit_loo_BF_KL      -98.6      13.8 
fit_loo_KL_BF     -131.8      13.4 
fit_loo_ER_BF     -139.3      18.6 
fit_loo_KL_ER     -202.3      20.8 
fit_loo_ER_KL     -218.0      19.2 
fit_loo_pure_BF   -223.5      23.1 
fit_loo_ER_ER     -243.3      25.6 
fit_loo_pure_ER   -245.4      27.2 
fit_loo_pure_KL   -246.4      19.2 
fit_loo_KL_KL     -264.9      18.6 
fit_loo_ER_pure   -268.0      24.7 
fit_loo_KL_pure   -271.3      20.8 
fit_loo_pure_pure -370.4      29.3 

Plot results.

Code
model_names <- c(
 "ProbCh + ER     ", "ProbCh + KL     ", "ProbCh + BF     ", "ProbCh + ConcCh ",
 "    ER + ER     ", "    ER + KL     ", "    ER + BF     ", "    ER + ConcCh ",
 "    KL + ER     ", "    KL + KL     ", "    KL + BF     ", "    KL + ConcCh ",
 "    BF + ER     ", "    BF + KL     ", "    BF + BF     ", "    BF + ConcCh "
  )

tibble(
  model = factor(model_names, levels = rev(model_names)),
  ELPD  = c(
    loo(fit_loo_pure_ER)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_pure_KL)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_pure_BF)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_pure_pure)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_ER_ER)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_ER_KL)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_ER_BF)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_ER_pure)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_KL_ER)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_KL_KL)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_KL_BF)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_KL_pure)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_BF_ER)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_BF_KL)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_BF_BF)$estimates['elpd_loo', 'Estimate'],
    loo(fit_loo_BF_pure)$estimates['elpd_loo', 'Estimate']
  ),
  SE = c(
    loo(fit_loo_pure_ER)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_pure_KL)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_pure_BF)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_pure_pure)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_ER_ER)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_ER_KL)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_ER_BF)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_ER_pure)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_KL_ER)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_KL_KL)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_KL_BF)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_KL_pure)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_BF_ER)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_BF_KL)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_BF_BF)$estimates['elpd_loo', 'SE'],
    loo(fit_loo_BF_pure)$estimates['elpd_loo', 'SE']
  ),
  lower = ELPD - SE,
  upper = ELPD + SE
) |> 
  ggplot(aes( x = model , y = ELPD)) +
  geom_pointrange(aes(min = lower, max = upper)) +
  coord_flip() +
  xlab("") +
  ylab("expected log likelihood (LOO-CV)") +
  theme(axis.text.y = element_text(family = "mono"))

Code
ggsave("plots/comparison-exploratory-hyp3.pdf", scale = 1, width = 8, height = 4)
Code
(loo_comp_explHyp3 <- loo_compare(loo(fit_loo_BF_BF), loo(fit_loo_BF_pure)))
                elpd_diff se_diff
fit_loo_BF_BF     0.0       0.0  
fit_loo_BF_pure -73.5      11.6  
Code
myvars["explHyp3LOODiff"] <- -1* (loo_comp_explHyp3[2,1] |> round(3))
myvars["explHyp3LOOSE"] <- loo_comp_explHyp3[2,2] |> round(3)
myvars["explHyp3PValue"] <- Lambert_test(loo_comp_explHyp3) |> round(3)

It seems that the overall best model in this comparison is the one that uses Bayes-factor based measures consistently. The second-order Bayes-factor based measure also seems to be the best to add to the other first-order measures. This also means that it is not the case the “being consisten” in choic of first- and second-order measure is always best.

Saving variables for LaTeX

Visualizing all results:

Code
library(viridis)
Code
d |> 
    ggplot(aes(x = prior_sliderResponse, y = posterior_sliderResponse, color = relevance_sliderResponse))  +
    geom_point(size = 3, alpha = 0.5) + 
    scale_color_viridis()

And let’s zoom in on some interesting spots:

Code
d |> 
    ggplot(aes(x = prior_sliderResponse, y = posterior_sliderResponse, color = relevance_sliderResponse))  +
    geom_jitter(width = 0.01, height = 0.01, size = 3, alpha = 0.7) + 
    scale_color_viridis() + 
    coord_cartesian(xlim = c(0, 0.25), ylim = c(0, 0.25))

Note: redo these plots but show delta of relevance_sliderResponse and BF etc.

And we do the same thing with the confidence ratings TODO: Sanity check whether condition with high bias prior and contradictory answer lead to decrease in confidence.

Code
d |> 
    ggplot(aes(x = prior_confidence, y = posterior_confidence, color = relevance_sliderResponse))  +
    geom_jitter(width = 0.4, height = 0.4, size = 3, alpha = 0.4) + 
    scale_color_viridis()

And now we can plot delta confidence and delta probability as x and y TODO: try removing non-answers from these (and other) plots

Code
d |> 
    ggplot(aes(x = ProbabilityChange, y = CommitmentChange, color = relevance_sliderResponse))  +
    geom_jitter(width = 0.0, height = 0.3, size = 3, alpha = 0.4) + 
    scale_color_viridis()

Code
d |> 
    ggplot(aes(x = ProbabilityChange, y = CommitmentChange, color = relevance_sliderResponse))  +
    geom_jitter(width = 0.01, height = 0.3, size = 3, alpha = 0.4) + 
    scale_color_viridis() + 
    coord_cartesian(xlim = c(-0.01, 0.15), ylim = c(-0.3, 4))